Analysis

Author

Michel Tarnow, Clara Lepsius, Gonzalo Cardenal

Published

November 22, 2023

Load packages

library(biomaRt)
library(limma)
library(affy)
library(preprocessCore)
library(UpSetR)
library(tidyverse)
library(RColorBrewer)
library(pheatmap)
library(edgeR)
library(DESeq2)
library(ggrepel)
library(pbapply)
library(viridis)
library(gplots)
library(msigdbr)
library(fgsea)
library(viridis)
library(UpSetR)
library(tximport)

Data import

Import TPM count data from rsem .genes.results files:

# my code

directory_path <-
  "~/OneDrive - ETH Zurich/CBB/FS1/Systems Genomics/sysgenom/data_new/gene_results/"
file_list <- list.files(directory_path)

expr <- data.frame(matrix(NA_integer_, nrow = 56884, ncol = 23))
names <- c()
for (i in file_list) {
  names <- append(names, unlist(strsplit(i, ".genes.results")))
}
colnames(expr) <- names
for (i in 1:length(file_list)) {
  file <- file.path(directory_path, file_list[i])
  df <- read.csv(file, sep = "\t")
  
  expr[, i] <- df$TPM
}

rownames(expr) <- df$gene_id

expr <- expr %>%
  rownames_to_column(var = "gene_name") %>%
  arrange(gene_name) %>%
  column_to_rownames(var = "gene_name")

head(expr)
# tutorial code
samples <- list.files("rsem")

expr2 <- sapply(samples, function(sample) {
  file <- paste0("rsem/", sample, "/", sample, ".genes.results")
  quant <- read.csv(file, sep = "\t", header = T)
  tpm2 <- setNames(quant$TPM, quant$gene_id)
  return(tpm2)
})

expr <- as.data.frame(expr2)

expr <- expr %>%
  rownames_to_column(var = "gene_name") %>%
  arrange(gene_name) %>%
  column_to_rownames(var = "gene_name")

head(expr)
                      SRR21355241 SRR21355242 SRR21355243 SRR21355246
ENSMUSG00000000001.5        19.49       21.47       20.63       16.42
ENSMUSG00000000003.16        0.00        0.00        0.00        0.00
ENSMUSG00000000028.16        1.36        1.21        1.43        0.52
ENSMUSG00000000031.19        0.00        0.16        0.00        0.50
ENSMUSG00000000037.18        0.00        0.07        0.18        0.15
ENSMUSG00000000049.12        1.05        1.06        1.65        0.33
                      SRR21355249 SRR21355250 SRR21355256 SRR21355360
ENSMUSG00000000001.5        19.74       16.93       21.91       19.17
ENSMUSG00000000003.16        0.00        0.00        0.00        0.00
ENSMUSG00000000028.16        1.57        0.45        1.48        1.29
ENSMUSG00000000031.19        0.00        0.00        0.14        0.08
ENSMUSG00000000037.18        0.05        0.21        0.26        0.12
ENSMUSG00000000049.12        1.00        0.68        1.29        0.71
                      SRR21355364 SRR21355366 SRR21355367 SRR21355374
ENSMUSG00000000001.5        19.85       19.81       22.34       23.04
ENSMUSG00000000003.16        0.00        0.00        0.00        0.00
ENSMUSG00000000028.16        1.26        0.74        1.37        1.26
ENSMUSG00000000031.19        0.00        0.22        0.00        0.00
ENSMUSG00000000037.18        0.00        0.05        0.23        0.12
ENSMUSG00000000049.12        1.19        1.29        0.45        1.11
                      SRR21355378 SRR21355380 SRR21355381 SRR21355382
ENSMUSG00000000001.5        23.86       17.59       16.23       19.77
ENSMUSG00000000003.16        0.00        0.00        0.00        0.00
ENSMUSG00000000028.16        1.58        0.38        1.21        1.98
ENSMUSG00000000031.19        0.16        0.00        0.00        0.10
ENSMUSG00000000037.18        0.09        0.33        0.18        0.17
ENSMUSG00000000049.12        1.22        0.74        1.49        1.55
                      SRR21355383 SRR21355696 SRR21355697 SRR21355698
ENSMUSG00000000001.5        20.20       23.96       20.37       24.05
ENSMUSG00000000003.16        0.00        0.00        0.00        0.00
ENSMUSG00000000028.16        0.92        2.01        1.40        1.21
ENSMUSG00000000031.19        0.00        0.00        0.00        0.00
ENSMUSG00000000037.18        0.12        0.14        0.04        0.15
ENSMUSG00000000049.12        0.79        1.39        1.17        0.00
                      SRR21355699 SRR21355703 SRR21355705
ENSMUSG00000000001.5        17.57       18.97       21.85
ENSMUSG00000000003.16        0.00        0.00        0.00
ENSMUSG00000000028.16        0.89        1.72        1.35
ENSMUSG00000000031.19        0.00        0.00        0.17
ENSMUSG00000000037.18        0.00        0.16        0.07
ENSMUSG00000000049.12        0.27        0.25        1.52
sum(expr != expr2)
[1] 0

Both are the same!

Import annotation data for the samples:

# read in meta data for the runs (contains sample and mouse id)
anno <- read.csv("/Users/gonuni/Desktop/College/CBB/1st Semester/Systems Genomics/Group_project/SraRunTable.txt", sep = ",")

# read in meta data for the samples (contains mouse id and age)
metadata <-
  read.csv("/Users/gonuni/Desktop/College/CBB/1st Semester/Systems Genomics/Group_project/BulkSeq_Aging_metadata.txt", sep = "\t")

# join information from anno and metadata to have age of mouse in each sample
age <- metadata %>%
  group_by(mouseID) %>%
  summarise(age)
ages <- c()
for (i in anno$mouse_id_number) {
  ages <- append(ages, age$age[age$mouseID == i])
}
anno$age <- ages
anno <- anno %>%
  select(Run, mouse_id_number, sex, age, everything())

anno <- anno[order(anno$Run),]
head(anno)
           Run mouse_id_number  sex age Assay.Type AvgSpotLen      Bases
7  SRR21355241              35 male  18    RNA-Seq        300 6477564300
8  SRR21355242              34 male  15    RNA-Seq        300 4162732500
9  SRR21355243              33 male   3    RNA-Seq        300 3252736800
10 SRR21355246              30 male  15    RNA-Seq        300 2338102800
11 SRR21355249              28 male  21    RNA-Seq        300 7603170300
12 SRR21355250              27 male  18    RNA-Seq        300 4418850300
    BioProject    BioSample      Bytes
7  PRJNA875066 SAMN30602244 2380886548
8  PRJNA875066 SAMN30602245 1551355324
9  PRJNA875066 SAMN30602246 1224759082
10 PRJNA875066 SAMN30602249  879784428
11 PRJNA875066 SAMN30602252 2814959314
12 PRJNA875066 SAMN30602253 1611542706
                                                             Center.Name
7  TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
8  TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
9  TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
10 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
11 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
12 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
   Consent DATASTORE.filetype DATASTORE.provider               DATASTORE.region
7   public   run.zq,fastq,sra         s3,ncbi,gs s3.us-east-1,gs.US,ncbi.public
8   public   sra,fastq,run.zq         s3,gs,ncbi s3.us-east-1,gs.US,ncbi.public
9   public   fastq,sra,run.zq         s3,gs,ncbi gs.US,ncbi.public,s3.us-east-1
10  public   run.zq,sra,fastq         s3,gs,ncbi gs.US,s3.us-east-1,ncbi.public
11  public   run.zq,sra,fastq         s3,ncbi,gs gs.US,s3.us-east-1,ncbi.public
12  public   run.zq,fastq,sra         gs,ncbi,s3 gs.US,s3.us-east-1,ncbi.public
    Experiment            Instrument Library.Name LibraryLayout
7  SRX17360880 Illumina NovaSeq 6000   GSM6527217        PAIRED
8  SRX17360879 Illumina NovaSeq 6000   GSM6527216        PAIRED
9  SRX17360878 Illumina NovaSeq 6000   GSM6527215        PAIRED
10 SRX17360875 Illumina NovaSeq 6000   GSM6527211        PAIRED
11 SRX17360872 Illumina NovaSeq 6000   GSM6527208        PAIRED
12 SRX17360871 Illumina NovaSeq 6000   GSM6527207        PAIRED
   LibrarySelection  LibrarySource     Organism Platform          ReleaseDate
7              cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
8              cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
9              cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
10             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
11             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
12             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
            create_date version Sample.Name  source_name SRA.Study    strain
7  2022-09-01T02:54:00Z       1  GSM6527217 Motor cortex SRP394927 C57BL/6JN
8  2022-09-01T02:37:00Z       1  GSM6527216 Motor cortex SRP394927 C57BL/6JN
9  2022-09-01T02:37:00Z       1  GSM6527215 Motor cortex SRP394927 C57BL/6JN
10 2022-09-01T02:32:00Z       1  GSM6527211 Motor cortex SRP394927 C57BL/6JN
11 2022-09-01T03:09:00Z       1  GSM6527208 Motor cortex SRP394927 C57BL/6JN
12 2022-09-01T02:45:00Z       1  GSM6527207 Motor cortex SRP394927 C57BL/6JN
         tissue
7  Motor cortex
8  Motor cortex
9  Motor cortex
10 Motor cortex
11 Motor cortex
12 Motor cortex

This is also the reason why we need multiple replicates of the each conditions; or the experiment would need to be designed in a way that samples from “different conditions” can be seen as “cross-replicates”, for instance, a reasonale number of samples all with different time points to study transcriptomic changes across the time course, so that we see the time course as a continuous condition rather than each individual time point as one distinct condition.

table(anno$age)

 3 12 15 18 21 26 28 
 4  5  4  4  4  1  1 

Import additional gene information:

devtools::install_version("dbplyr", version = "2.3.4") httr::set_config(httr::config(ssl_verifypeer = FALSE))

ensembl <- useEnsembl(biomart = "ensembl",
                      dataset = "mmusculus_gene_ensembl")

meta_genes <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "ensembl_gene_id_version",
    "external_gene_name",
    "description",
    "chromosome_name",
    "start_position",
    "end_position",
    "strand"
  ),
  filters = "ensembl_gene_id_version",
  values = rownames(expr),
  mart = ensembl
) %>%
  right_join(data.frame(ensembl_gene_id_version = rownames(expr)),
             by = "ensembl_gene_id_version") %>%
  distinct(ensembl_gene_id_version, .keep_all = TRUE)

expr <- expr[meta_genes$ensembl_gene_id_version, ]
rownames(meta_genes) <- meta_genes$ensembl_gene_id_version

Exploratory data analysis

dim(expr)
[1] 56884    23
dim(meta_genes)
[1] 56884     8

We have 56884 annotated genes in 23 samples.

avg_expr <- rowMeans(expr)

layout(matrix(1:2, nrow = 1))
hist(avg_expr)
hist(log10(avg_expr + 1))

ggplot(data = as.data.frame(avg_expr), mapping =  aes(x = avg_expr)) +
  geom_histogram(
    color = "white",
    fill = brewer.pal(n = 3, name = "Set1")[2],
    bins = 50
  ) +
  labs(title = "Distribution of average expression values of all genes",
       x = "Average expression",
       y = "Count") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(data = as.data.frame(avg_expr), mapping =  aes(x = log(avg_expr + 1))) +
  geom_histogram(
    color = "white",
    fill = brewer.pal(n = 3, name = "Set1")[2],
    bins = 50
  ) +
  labs(title = "Distribution of average expression values of all genes",
       x = "log10(Average expression + 1)",
       y = "Count") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(data = as.data.frame(avg_expr), mapping =  aes(x = avg_expr)) +
  geom_histogram(
    color = "white",
    fill = brewer.pal(n = 3, name = "Set1")[2],
    bins = 50
  ) +
  scale_x_continuous(
    breaks = c(0, 1, 10, 100, 1000, 10000, 20000),
    trans = "log1p",
    expand = c(0, 0)
  ) +
  scale_y_continuous(breaks = c(0, 1),
                     expand = c(0, 0),
                     trans = "log1p") +
  labs(title = "Distribution of average expression values of all genes",
       x = "log1p(Average expression)",
       y = "log1p(Count)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

num_det <- rowSums(expr > 0)

ggplot(data = as.data.frame(num_det), mapping =  aes(x = num_det)) +
  geom_histogram(
    color = "white",
    fill = brewer.pal(n = 3, name = "Set1")[2],
    bins = 23
  ) +
  labs(title = "Distribution of number samples in which each gene is detected",
       x = "Number of samples",
       y = "Count") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

hist(num_det)

The histograms show that many genes are not detected in all or most of the samples, or detected only with low expression. Thus, these genes are filtered out since they are probably not of interest:

# filter expressed genes and add this information to the meta_genes file
# threshold: genes must be detected in at least half of the samples
#            and the average TPM must be >= 1

expressed <- rowMeans(expr > 0) >= 0.5 | rowMeans(expr) >= 1
meta_genes$expressed <- expressed

num_det <- rowSums(expr[meta_genes$expressed,] > 0)

ggplot(data = as.data.frame(num_det), mapping =  aes(x = num_det)) +
  geom_histogram(
    color = "white",
    fill = brewer.pal(n = 3, name = "Set1")[2],
    bins = 23
  ) +
  labs(title = "Distribution of number samples in which each gene is detected",
       x = "Number of samples",
       y = "Count") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

hist(num_det)

# compute pairwise correlation between the samples

corr_pearson <- cor(log1p(expr[meta_genes$expressed,]))
corr_spearman <- cor(expr[meta_genes$expressed,], method = "spearman")

pheatmap(corr_pearson)

pheatmap(corr_spearman)

hcl_pearson <- hclust(as.dist(1 - corr_pearson))
hcl_spearman <- hclust(as.dist(1 - corr_spearman))

plot(hcl_pearson, labels = anno$age)

plot(hcl_spearman, labels = anno$age)

# PCA dimensionality reduction

pca <-
  prcomp(log1p(t(expr[meta_genes$expressed, ])), center = TRUE, scale. = TRUE)

eigs <- pca$sdev^2
plot(1:length(eigs), eigs)

ggplot(data = as.data.frame(pca$x),
       mapping = aes(
         x = PC1,
         y = PC2,
         #color = as.factor(anno$age)
         color = anno$age,
         shape = anno$sex
       )) +
  geom_point(size = 4) +
  labs(title = "PCA plot", color = "Age", shape = "Sex") +
  scale_color_viridis_c() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# MDS dimensionality reduction

mds <- plotMDS(log1p(expr[meta_genes$expressed,]), plot = FALSE)

df <- data.frame(MDS1 = mds$x,
                 MDS2 = mds$y,
                 age = anno$age,
                 shape = anno$sex)
ggplot(df, aes(x = MDS1, y = MDS2, colour = age)) +
  geom_point(size = 4) +
  labs(title = "MDS plot", color = "Age") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_viridis_c()

Highly variable genes identification

Function to estimate the variability of the genes:

estimate_variability <- function(expr){
  means <- apply(expr, 1, mean)
  vars <- apply(expr, 1, var)
  cv2 <- vars / means^2
  
  minMeanForFit <- unname(median(means[which(cv2 > 0.3)]))
  useForFit <- means >= minMeanForFit
  fit <- glm.fit(x = cbind(a0 = 1, a1tilde = 1/means[useForFit]),
                 y = cv2[useForFit],
                 family = Gamma(link = "identity"))
  a0 <- unname(fit$coefficients["a0"])
  a1 <- unname(fit$coefficients["a1tilde"])
  
  xg <- exp(seq(min(log(means[means>0])), max(log(means)), length.out=1000))
  vfit <- a1/xg + a0
  df <- ncol(expr) - 1
  afit <- a1/means+a0
  varFitRatio <- vars/(afit*means^2)
  pval <- pchisq(varFitRatio*df,df=df,lower.tail=F)
  
  res <- data.frame(mean = means,
                    var = vars,
                    cv2 = cv2,
                    useForFit = useForFit,
                    pval = pval,
                    padj = p.adjust(pval, method="BH"),
                    row.names = rownames(expr))
  return(res)
}

Test for significance of over dispersion:

var_genes <- estimate_variability(expr[meta_genes$expressed, ])
meta_genes$highvar <-
  meta_genes$ensembl_gene_id_version %in% rownames(var_genes)[which(var_genes$padj < 0.05)]

Hierarchical clustering and PCA only with highly variable and expressed genes:

corr_spearman_highvar <-
  cor(expr[meta_genes$highvar,], method = "spearman")
hcl_spearman_highvar <- hclust(as.dist(1 - corr_spearman_highvar))
plot(hcl_spearman_highvar, labels = anno$age)

pca_highvar <-
  prcomp(log1p(t(expr[meta_genes$highvar,])), center = TRUE, scale. = TRUE)
ggplot(data = as.data.frame(pca_highvar$x),
       mapping = aes(x = PC1,
                     y = PC2,
                     color = anno$age,
                     shape = anno$sex)) +
  geom_point(size = 4) +
  labs(title = "PCA plot (only highly variable genes)", color = "Age", shape = "Sex") +
  scale_color_viridis_c() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Differential expression analysis

To identify significant differential expression changes with age, we used the raw count matrix as recommended for the DEseq2 standard analysis pipeline. Factors and dispersion estimates were calculated for each region separately. We conducted differential expression analysis comparing samples from 3 months to each consecutive time point, using sex as covariate. This is consistent with previously published differential expression analyses performed across whole organs in mice.8 P values were adjusted for multiple testing, and genes with an adjusted P value of less than 0.05 were determined to be statistically significant. Finally, we required a gene to reach statistical significance (after multiple testing correction) in at least 2 pairwise comparisons (e.g. 3 months vs 18 months and 3 months vs 21 months) to be called a differentially expressed gene (DEG). We chose this criterion to retain only genes with robust differential expression patterns across age groups. We recognize that this tends to select against genes that are differentially expressed very late in life (i.e. 3 months vs 28 months).

ANOVA

DE_test <- function(expr,
                    cond,
                    ctrl = NULL,
                    covar = NULL,
                    padj_method = p.adjust.methods){
  pval_fc <- data.frame(t(pbapply(expr, 1, function(e){
    dat <- data.frame(y = log1p(e),
                      cond = cond)
    if (! is.null(covar))
      dat <- data.frame(dat, covar)
    
    m1 <- lm(y ~ ., data = dat)
    m0 <- lm(y ~ . - cond, data = dat)
    test <- anova(m1, m0)
    pval <- test$Pr[2]
    
    avgs <- tapply(log1p(e), cond, mean)
    if (! is.null(ctrl) && sum(cond %in% ctrl) > 0){
      fc <- exp(max(avgs[names(avgs) != ctrl]) - avgs[ctrl])
    } else{
      fc <- exp(max(avgs) - min(avgs))
    }
    
    return(c(pval = unname(pval), fc = unname(fc)))
  })), row.names = rownames(expr))
  padj <- p.adjust(pval_fc$pval, method = padj_method)
  return(data.frame(pval_fc, "padj" = padj)[,c("pval","padj","fc")])
}

# expressed genes
res_DE <- DE_test(expr = expr[meta_genes$expressed,],
                  cond = anno$age,
                  covar = anno$sex) %>%
  tibble::rownames_to_column("gene")
res_DE <- res_DE %>%
  mutate(DE = padj < 0.1 & fc > 2) %>%
  mutate(DEG = ifelse(DE, meta_genes$external_gene_name[meta_genes$expressed], NA))

ggplot(res_DE, aes(
  x = log(fc),
  y = -log10(padj),
  col = DE,
  label = DEG
)) +
  geom_point() +
  geom_text_repel() +
  geom_vline(
    xintercept = c(log(2), 0),
    col = "#303030",
    linetype = "dotted"
  ) +
  geom_hline(
    yintercept = -log10(0.1),
    col = "#303030",
    linetype = "dotted"
  ) +
  scale_color_manual(values = c("#909090", "red")) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "ANOVA differential expression analysis", 
       x = "logFC", 
       y = "-log10(p-value)")
Warning: Removed 24353 rows containing missing values (`geom_text_repel()`).

DESeq2

tx2gene <- getBM(attributes = c("ensembl_transcript_id_version",
                                "ensembl_gene_id_version"),
                 filters = "ensembl_gene_id_version",
                 values = rownames(expr),
                 mart = ensembl) %>%
  dplyr::select(ensembl_transcript_id_version, ensembl_gene_id_version)

samples <-  list.files("rsem")
files <- file.path("rsem", samples, paste0(samples,".isoforms.results"))
txi <- tximport(files, type = "rsem", tx2gene = tx2gene)
reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 
summarizing abundance
summarizing counts
summarizing length
dds <- DESeqDataSetFromTximport(txi,
                                colData = anno,
                                design = ~ age + sex)
Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors
  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function
  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  Including numeric variables with large mean can induce collinearity with the intercept.
  Users should center and scale numeric variables in the design to improve GLM convergence.
using counts and average transcript lengths from tximport
dds_filtered <-
  dds[intersect(rownames(expr)[meta_genes$expressed], rownames(dds)), ]
dds_filtered <- DESeq(dds_filtered, test = "LRT", reduced = ~ sex)
estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
res_DESeq2 <- results(dds_filtered)
padj <- p.adjust(res_DESeq2$pvalue, method = "holm") < 0.1
de_genes_deseq2 <- rownames(res_DESeq2)[padj]
de_genes_deseq2 <- as.vector(na.omit(de_genes_deseq2))
de_genes_deseq2_ext_name <-
  meta_genes[de_genes_deseq2,]$external_gene_name

edgeR

samples <- list.files("rsem")

counts <- sapply(samples, function(sample) {
  file <- paste0("rsem/", sample, "/", sample, ".genes.results")
  quant <- read.csv(file, sep = "\t", header = T)
  tpm2 <- setNames(quant$expected_count, quant$gene_id)
  return(tpm2)
})

counts <- as.data.frame(counts)

counts <- counts %>% 
  rownames_to_column(var = "gene_name") %>% 
  arrange(gene_name) %>% 
  column_to_rownames(var = "gene_name")

head(counts)
                      SRR21355241 SRR21355242 SRR21355243 SRR21355246
ENSMUSG00000000001.5          445         305         226         149
ENSMUSG00000000003.16           0           0           0           0
ENSMUSG00000000028.16          18          11          10           3
ENSMUSG00000000031.19           0           1           0           2
ENSMUSG00000000037.18           0           1           3           2
ENSMUSG00000000049.12           8           5           6           1
                      SRR21355249 SRR21355250 SRR21355256 SRR21355360
ENSMUSG00000000001.5          531         297         555         408
ENSMUSG00000000003.16           0           0           0           0
ENSMUSG00000000028.16          27           5          24          14
ENSMUSG00000000031.19           0           0           2           1
ENSMUSG00000000037.18           2           4           8           4
ENSMUSG00000000049.12           9           4          11           5
                      SRR21355364 SRR21355366 SRR21355367 SRR21355374
ENSMUSG00000000001.5          551         418         894         373
ENSMUSG00000000003.16           0           0           0           0
ENSMUSG00000000028.16          22           8          32          13
ENSMUSG00000000031.19           0           2           0           0
ENSMUSG00000000037.18           0           1          14           2
ENSMUSG00000000049.12          11           9           6           6
                      SRR21355378 SRR21355380 SRR21355381 SRR21355382
ENSMUSG00000000001.5          355         143         293         462
ENSMUSG00000000003.16           0           0           0           0
ENSMUSG00000000028.16          13           2          14          26
ENSMUSG00000000031.19           1           0           0           1
ENSMUSG00000000037.18           2           4           5           6
ENSMUSG00000000049.12           6           2           9          12
                      SRR21355383 SRR21355696 SRR21355697 SRR21355698
ENSMUSG00000000001.5          683         466         365         350
ENSMUSG00000000003.16           0           0           0           0
ENSMUSG00000000028.16          16          25          16           9
ENSMUSG00000000031.19           0           0           0           0
ENSMUSG00000000037.18           6           4           1           2
ENSMUSG00000000049.12           9           9           7           0
                      SRR21355699 SRR21355703 SRR21355705
ENSMUSG00000000001.5          193         224         606
ENSMUSG00000000003.16           0           0           0
ENSMUSG00000000028.16           5          13          24
ENSMUSG00000000031.19           0           0           2
ENSMUSG00000000037.18           0           2           3
ENSMUSG00000000049.12           1           1          14
# define groups for design matrix
age <- factor(anno$age)
sex <- factor(anno$sex)

# convert data to DGEList object
y <- DGEList(counts = counts[meta_genes$expressed,], group = age, sex = sex)

# normalize the data to account for differences in library sizes and sequencing depth
y <- calcNormFactors(y)

# create design matrix
design <- model.matrix( ~ age + sex)
design
   (Intercept) age12 age15 age18 age21 age26 age28 sexmale
1            1     0     0     1     0     0     0       1
2            1     0     1     0     0     0     0       1
3            1     0     0     0     0     0     0       1
4            1     0     1     0     0     0     0       1
5            1     0     0     0     1     0     0       1
6            1     0     0     1     0     0     0       1
7            1     0     0     0     0     0     0       1
8            1     0     0     0     0     0     1       1
9            1     0     0     0     0     0     0       0
10           1     0     0     1     0     0     0       0
11           1     0     1     0     0     0     0       0
12           1     0     0     0     1     0     0       0
13           1     0     0     0     1     0     0       0
14           1     0     0     1     0     0     0       0
15           1     0     1     0     0     0     0       0
16           1     0     0     0     0     0     0       0
17           1     0     0     0     1     0     0       1
18           1     1     0     0     0     0     0       1
19           1     0     0     0     0     1     0       1
20           1     1     0     0     0     0     0       0
21           1     1     0     0     0     0     0       1
22           1     1     0     0     0     0     0       1
23           1     1     0     0     0     0     0       0
attr(,"assign")
[1] 0 1 1 1 1 1 1 2
attr(,"contrasts")
attr(,"contrasts")$age
[1] "contr.treatment"

attr(,"contrasts")$sex
[1] "contr.treatment"
# estimate dispersion
y <- estimateDisp(y, design)

# fit the negative binomial model
fit <- glmFit(y, design)

# conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrt_all <- glmLRT(fit, coef = 2:7)
topTags(lrt_all)
Coefficient:  age12 age15 age18 age21 age26 age28 
                      logFC.age12 logFC.age15 logFC.age18 logFC.age21
ENSMUSG00000109006.4   0.42549918  0.64432382  0.72138263  0.72678372
ENSMUSG00000092178.3   2.70228069  2.83114302  3.43410901  3.64978902
ENSMUSG00000052229.6  -0.59373785 -0.69719440 -0.92894309 -1.06495887
ENSMUSG00000051242.2   0.67363096  0.74441494  0.85017560  0.96052688
ENSMUSG00000103897.2   0.81683705  0.47975724  0.43610018  0.87429764
ENSMUSG00000073418.5   0.28577065  0.76687301  0.69784255  1.69148018
ENSMUSG00000004894.11  0.35355271  0.80515826  0.59799117  1.00051971
ENSMUSG00000022076.11 -1.07314215 -1.14384988 -1.51811213 -1.46217348
ENSMUSG00000102422.2   0.51006045 -0.50716334 -0.30847966  0.02707808
ENSMUSG00000075602.11  0.01113071 -0.01473805  0.06415556  0.41015285
                      logFC.age26 logFC.age28   logCPM        LR       PValue
ENSMUSG00000109006.4    0.6629856  0.85019534 6.715016 202.53240 5.484322e-41
ENSMUSG00000092178.3    3.8196167  4.23159934 2.139035 138.00937 2.636076e-27
ENSMUSG00000052229.6   -1.6205104 -1.19079464 4.239970 127.23686 4.905170e-25
ENSMUSG00000051242.2    0.8819022  1.29168983 4.440796  92.90605 7.540477e-18
ENSMUSG00000103897.2    1.0979546  0.91302672 4.382299  82.47698 1.098958e-15
ENSMUSG00000073418.5    1.6704738  2.70322000 3.220873  80.83753 2.398670e-15
ENSMUSG00000004894.11   0.9139761  1.28641546 3.724848  80.52670 2.781013e-15
ENSMUSG00000022076.11  -2.0742497 -2.35490774 1.708595  68.93775 6.750994e-13
ENSMUSG00000102422.2   -8.1383376 -1.72703022 2.182698  63.94556 7.080725e-12
ENSMUSG00000075602.11   0.8262527 -0.08940765 5.247879  63.81576 7.525836e-12
                               FDR
ENSMUSG00000109006.4  1.335816e-36
ENSMUSG00000092178.3  3.210345e-23
ENSMUSG00000052229.6  3.982508e-21
ENSMUSG00000051242.2  4.591585e-14
ENSMUSG00000103897.2  5.353464e-12
ENSMUSG00000073418.5  9.676733e-12
ENSMUSG00000004894.11 9.676733e-12
ENSMUSG00000022076.11 2.055424e-09
ENSMUSG00000102422.2  1.833068e-08
ENSMUSG00000075602.11 1.833068e-08
# multiple testing correction
decide_dif_all <-
  decideTests.DGELRT(
    lrt_all,
    adjust.method = "holm",
    p.value = 0.1#,
    #lfc = log(2)
  )

summary(decide_dif_all)
       age28-age26-age21-age18-age15-age12
NotSig                               24285
Sig                                     72
de_genes_edgeR <- rownames(decide_dif_all[decide_dif_all[,1] != 0,])
de_genes_edgeR_ext_name <- meta_genes[de_genes_edgeR,]$external_gene_name
de_genes_edgeR_ext_name
 [1] "Hapln2"        "Cabp7"         "Met"           "Bcas1"        
 [5] "Ctsz"          "Sparc"         "Gfap"          "Klhl1"        
 [9] "Pdlim2"        "Apod"          "Banp"          "Casp1"        
[13] "Snca"          "Lcn2"          "Gatm"          "Fermt1"       
[17] "Stmn2"         "Gnb4"          "Hmgcs2"        "Calb1"        
[21] "Tmeff1"        "Plekhb1"       "Mt1"           "Rims3"        
[25] "Zdhhc15"       "Camk4"         "Rgs4"          "Nrn1"         
[29] "Flywch1"       "Abca8a"        "Tmcc2"         "Kdm7a"        
[33] "Lsm11"         "Cirbp"         "Vwa5b2"        "Marcksl1"     
[37] "Eif5a2"        "Pcdhb9"        "Pcdhb2"        "Gpr17"        
[41] "H2-Q7"         "Cadm2"         "Kndc1"         "Cst7"         
[45] "Lyz2"          "Gm20634"       "2410018L13Rik" "H2-Q6"        
[49] "C4b"           "Ly6a"          "Ass1"          "Igkc"         
[53] "Gm4294"        "Ifi27l2a"      "Ly6c1"         "Mdfic2"       
[57] "Gm1979"        "Tnnc1"         "Gm45351"       "Neat1"        
[61] "Obox3-ps7"     "Iqschfp"       "Pcdhga7"       "Pcdhga8"      
[65] "Gm42756"       "Gm42715"       "Gm44002"       "B230209E15Rik"
[69] "9330121K16Rik" "Gm32687"       "AA414992"      ""             
# comparison of 12 and 3 months

lrt <- glmLRT(fit, coef = 2)
decide_dif <-
  decideTests.DGELRT(
    lrt,
    adjust.method = "holm",
    p.value = 0.1,
    lfc = log(2)
  )

ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = abs(logFC),
    y = -log10(p.adjust(PValue, method = "holm")),
    label = meta_genes[rownames(y),]$external_gene_name,
    color = as.factor(decide_dif)
  )
) +
  geom_point() +
  geom_vline(xintercept = log(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
  geom_text_repel() +
  labs(
    title = "Differential expression 3 vs. 12 months",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Warning: ggrepel: 24298 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

# comparison of 21 and 3 months

lrt <- glmLRT(fit, coef = 5)
decide_dif <-
  decideTests.DGELRT(
    lrt,
    adjust.method = "holm",
    p.value = 0.1,
    lfc = log(2)
  )

ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = abs(logFC),
    y = -log10(p.adjust(PValue, method = "holm")),
    label = meta_genes[rownames(y),]$external_gene_name,
    color = as.factor(decide_dif)
  )
) +
  geom_point() +
  geom_vline(xintercept = log(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
  geom_text_repel() +
  labs(
    title = "Differential expression 3 vs. 21 months",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Warning: ggrepel: 24297 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

# comparison of 28 and 3 months

lrt <- glmLRT(fit, coef = 7)
decide_dif <-
  decideTests.DGELRT(
    lrt,
    adjust.method = "holm",
    p.value = 0.1,
    lfc = log(2)
  )

ggplot(
  data = as.data.frame(lrt$table),
  mapping = aes(
    x = abs(logFC),
    y = -log10(p.adjust(PValue, method = "holm")),
    label = meta_genes[rownames(y),]$external_gene_name,
    color = as.factor(decide_dif)
  )
) +
  geom_point() +
  geom_vline(xintercept = log(2), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
  geom_text_repel() +
  labs(
    title = "Differential expression 3 vs. 28 months",
    x = "LogFC",
    y = "-10log(p-value)",
    color = "DE"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Warning: ggrepel: 24301 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

c4b <-
  data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "C4b", ]$ensembl_gene_id_version,])), age = anno$age)
c4b <-
  c4b %>% group_by(age) %>% summarise(expression = mean(expression))
c4b
# A tibble: 7 × 2
    age expression
  <int>      <dbl>
1     3       1.25
2    12       1.56
3    15       2.04
4    18       1.92
5    21       4.28
6    26       3.84
7    28       7.47
gpr17 <-
  data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "Gpr17", ]$ensembl_gene_id_version,])), age = anno$age)
gpr17 <-
  gpr17 %>% group_by(age) %>% summarise(expression = mean(expression))
gpr17
# A tibble: 7 × 2
    age expression
  <int>      <dbl>
1     3       8.68
2    12       5.91
3    15       5.08
4    18       4.17
5    21       4.27
6    26       3.03
7    28       3.88
H2Q6 <-
  data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "H2-Q6", ]$ensembl_gene_id_version,])), age = anno$age)
H2Q6 <-
  H2Q6 %>% group_by(age) %>% summarise(expression = mean(expression))
H2Q6
# A tibble: 7 × 2
    age expression
  <int>      <dbl>
1     3      0.395
2    12      1.21 
3    15      1.61 
4    18      1.71 
5    21      2.44 
6    26      4.14 
7    28      2.1  
H2Q7 <-
  data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "H2-Q7", ]$ensembl_gene_id_version,])), age = anno$age)
H2Q7 <-
  H2Q7 %>% group_by(age) %>% summarise(expression = mean(expression))
H2Q7
# A tibble: 7 × 2
    age expression
  <int>      <dbl>
1     3      0.617
2    12      1.52 
3    15      1.97 
4    18      2.46 
5    21      3.34 
6    26      5.36 
7    28      1.89 
ggplot(data = c4b,
       mapping = aes(
         x = as.factor(age),
         y = expression,
         fill = as.factor(age)
       )) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d() +
  labs(title = "Expression of C4b over time",
       x = "Age (months)",
       y = "TPM",
       fill = "Age")

ggplot(data = gpr17,
       mapping = aes(
         x = as.factor(age),
         y = expression,
         fill = as.factor(age)
       )) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d() +
  labs(title = "Expression of Gpr17 over time",
       x = "Age (months)",
       y = "TPM",
       fill = "Age")

ggplot(data = H2Q6,
       mapping = aes(
         x = as.factor(age),
         y = expression,
         fill = as.factor(age)
       )) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d() +
  labs(title = "Expression of H2-Q6 over time",
       x = "Age (months)",
       y = "TPM",
       fill = "Age")

ggplot(data = H2Q7,
       mapping = aes(
         x = as.factor(age),
         y = expression,
         fill = as.factor(age)
       )) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d() +
  labs(title = "Expression of H2-Q7 over time",
       x = "Age (months)",
       y = "TPM",
       fill = "Age")

Comparison of DE analysis methods

UpSet plot:

eR <- decide_dif_all

d2 <- padj * 1
d2 <- ifelse(is.na(d2), 0, d2)
  
anov <- as.data.frame(res_DE$DE * 1)
rownames(anov) <- res_DE$gene

df <- cbind(eR, d2, anov)
colnames(df) <- c("edgeR", "DESeq2", "ANOVA")

upset(as.data.frame(df), sets = colnames(df))

intersect(de_genes_deseq2_ext_name, de_genes_edgeR_ext_name)
 [1] "Hapln2"        "Cabp7"         "Met"           "Ctsz"         
 [5] "Sparc"         "Klhl1"         "Apod"          "Fermt1"       
 [9] "Gnb4"          "Hmgcs2"        "Calb1"         "Camk4"        
[13] "Rgs4"          "Nrn1"          "Flywch1"       "Abca8a"       
[17] "Lsm11"         "Cirbp"         "Marcksl1"      "Pcdhb9"       
[21] "Pcdhb2"        "Gpr17"         "H2-Q7"         "Cadm2"        
[25] "Cst7"          "Lyz2"          "H2-Q6"         "C4b"          
[29] "Ass1"          "Gm45351"       "Neat1"         "Pcdhga7"      
[33] "Gm42756"       "B230209E15Rik" "AA414992"     

Comparison of p-values:

# combine LogFC and plot
df <- as.data.frame(cbind(p.adjust(lrt_all$table$PValue, method = "holm"), res_DE$padj))
colnames(df) <- c("padj_edgeR", "padj_anova")

ggplot(data = df, mapping = aes(x = -log10(padj_edgeR), y = -log10(padj_anova))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Comparison of adjusted P-values", x = "-log10 P-value (egdeR)", y = "-log10 P-value (ANOVA)")

df <- as.data.frame(cbind(lrt_all$table$PValue, res_DE$pval))
colnames(df) <- c("p_edgeR", "p_anova")

ggplot(data = df, mapping = aes(x = -log10(p_edgeR), y = -log10(p_anova))) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "blue") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Comparison of P-values", x = "-log10 P-value (egdeR)", y = "-log10 P-value (ANOVA)")

Comparison of logFC:

Grouping of the identified DEGs

DEG <- de_genes_edgeR

avg_expr <- as.data.frame(sapply(sort(unique(anno$age))[1:5], function(age) {
  rowMeans(expr[, which(anno$age == age)])
}))

avg_expr$V6 <- unlist(as.vector(expr[, which(anno$age == 26)]))
avg_expr$V7 <- unlist(as.vector(expr[, which(anno$age == 28)]))

colnames(avg_expr) <- sort(unique(anno$age))

max_age_DEG <-
  setNames(colnames(avg_expr)[apply(avg_expr[DEG, ], 1, which.max)], DEG)
table(max_age_DEG)
max_age_DEG
12 18 21 26 28  3 
 4  2  3 25 19 19 
avg_expr_DEG_list <- tapply(names(max_age_DEG), max_age_DEG, function(x) avg_expr[x,])
scaled_expr_DEG_list <- lapply(avg_expr_DEG_list, function(x) t(scale(t(x))))

layout(matrix(1:8, nrow = 2, byrow = T))
par(mar=c(3,3,3,3))
for(age in names(scaled_expr_DEG_list))
  boxplot(scaled_expr_DEG_list[[age]],
          main = paste0(age, " (", nrow(scaled_expr_DEG_list[[age]]), ")"))

corr_DEG <- cor(t(avg_expr[DEG,]), method = "spearman")
hcl_DEG <- hclust(as.dist(1 - corr_DEG), method = "complete")
plot(hcl_DEG)

cl_DEG <- cutree(hcl_DEG, k = 4)
heatmap.2(corr_DEG, Rowv = as.dendrogram(hcl_DEG), Colv = as.dendrogram(hcl_DEG),
          trace = "none", scale = "none", labRow = NA, labCol = NA, col = viridis,
          ColSideColors = rainbow(15)[cl_DEG])

avg_expr_DEG_list <- tapply(names(cl_DEG), cl_DEG, function(x) avg_expr[x,])
scaled_expr_DEG_list <- lapply(avg_expr_DEG_list, function(x) t(scale(t(x))))

layout(matrix(1:4, nrow = 2, byrow = T))
for(cl in 1:4)
  boxplot(scaled_expr_DEG_list[[cl]],
          main = paste0(cl, " (", nrow(scaled_expr_DEG_list[[cl]]), ")"))

# cluster 1:
rownames(scaled_expr_DEG_list[[1]])
 [1] "ENSMUSG00000004894.11" "ENSMUSG00000013523.14" "ENSMUSG00000020932.15"
 [4] "ENSMUSG00000022090.11" "ENSMUSG00000022548.15" "ENSMUSG00000025889.14"
 [7] "ENSMUSG00000027199.15" "ENSMUSG00000027500.11" "ENSMUSG00000027669.15"
[10] "ENSMUSG00000030701.18" "ENSMUSG00000045193.14" "ENSMUSG00000051242.2" 
[13] "ENSMUSG00000068129.6"  "ENSMUSG00000069516.9"  "ENSMUSG00000073418.5" 
[16] "ENSMUSG00000076441.10" "ENSMUSG00000076609.3"  "ENSMUSG00000091049.9" 
[19] "ENSMUSG00000091898.9"  "ENSMUSG00000092178.3"  "ENSMUSG00000092274.5" 
[22] "ENSMUSG00000104674.3"  "ENSMUSG00000108158.2"  "ENSMUSG00000109006.4" 
[25] "ENSMUSG00000113342.2" 
print("")
[1] ""
# cluster 2:
rownames(scaled_expr_DEG_list[[2]])
 [1] "ENSMUSG00000009075.3"  "ENSMUSG00000016256.11" "ENSMUSG00000018593.14"
 [4] "ENSMUSG00000025888.7"  "ENSMUSG00000026822.15" "ENSMUSG00000027356.9" 
 [7] "ENSMUSG00000032890.18" "ENSMUSG00000040097.17" "ENSMUSG00000041828.16"
[10] "ENSMUSG00000042066.17" "ENSMUSG00000046613.20" "ENSMUSG00000051599.5" 
[13] "ENSMUSG00000060550.17" "ENSMUSG00000066129.18" "ENSMUSG00000073409.13"
[16] "ENSMUSG00000079017.4"  "ENSMUSG00000090667.3"  "ENSMUSG00000093773.2" 
[19] "ENSMUSG00000103472.2"  "ENSMUSG00000103897.2"  "ENSMUSG00000121378.1" 
print("")
[1] ""
# cluster 3:
rownames(scaled_expr_DEG_list[[3]])
[1] "ENSMUSG00000009376.16" "ENSMUSG00000022076.11" "ENSMUSG00000027875.13"
[4] "ENSMUSG00000028347.15" "ENSMUSG00000047945.7"  "ENSMUSG00000052229.6" 
[7] "ENSMUSG00000073164.12" "ENSMUSG00000078377.6"  "ENSMUSG00000102422.2" 
print("")
[1] ""
# cluster 4:
rownames(scaled_expr_DEG_list[[4]])
 [1] "ENSMUSG00000025316.17" "ENSMUSG00000028222.3"  "ENSMUSG00000031765.9" 
 [4] "ENSMUSG00000033906.6"  "ENSMUSG00000038128.8"  "ENSMUSG00000038530.12"
 [7] "ENSMUSG00000039114.17" "ENSMUSG00000042599.9"  "ENSMUSG00000044847.14"
[10] "ENSMUSG00000050192.9"  "ENSMUSG00000064115.14" "ENSMUSG00000070392.6" 
[13] "ENSMUSG00000075602.11" "ENSMUSG00000079018.11" "ENSMUSG00000107023.3" 
[16] "ENSMUSG00000110249.2"  "ENSMUSG00000112640.2" 

Making sense of the genes

Frequency-based method DAVID (Not optimal as we have low number of DE genes)

#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))

write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==2)), "ensembl_gene_id"],
            file = "genes_C2.txt",
            quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
            file = "genes_expressed.txt",
            quote = F, row.names = F, col.names = F)

Cluster 3 is the one with down expression progressively across ages, therefore:

#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))

write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==3)), "ensembl_gene_id"],
            file = "genes_C3.txt",
            quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
            file = "genes_expressed.txt",
            quote = F, row.names = F, col.names = F)

Cluster 1 is the one with up expression progressively across ages, therefore:

#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))

write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==1)), "ensembl_gene_id"],
            file = "genes_C1.txt",
            quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
            file = "genes_expressed.txt",
            quote = F, row.names = F, col.names = F)

And now cluster 4:

#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))

write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==4)), "ensembl_gene_id"],
            file = "genes_C4.txt",
            quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
            file = "genes_expressed.txt",
            quote = F, row.names = F, col.names = F)

Rank-based method GSEA

DE_A26 <- DE_test(expr = expr[meta_genes$expressed,],
                 cond = anno$age == "26",
                 ctrl = "FALSE",
                 covar = anno %>% dplyr::select(sex)) %>%
  tibble::rownames_to_column("gene")

scores <- setNames(sign(log(DE_A26$fc)) * (-log10(DE_A26$pval)),
                   setNames(meta_genes$ensembl_gene_id,
                            meta_genes$ensembl_gene_id_version)[DE_A26$gene])
scores_ordered <- sort(scores, decreasing=T)

library(msigdbr)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)

library(fgsea)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
                    stats = scores_ordered[-1],
                    minSize  = 15,
                    maxSize  = 500)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.34% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : There were 1 pathways for which P-values were not calculated
properly due to unbalanced (positive and negative) gene-level statistic values.
For such pathways pval, padj, NES, log2err are set to NA. You can try to
increase the value of the argument nPermSimple (for example set it nPermSimple
= 10000)
fgsea_kegg[order(NES,decreasing=T),][1:10,]
                                                                 pathway
 1:                                                 LEI_HOXC8_TARGETS_DN
 2:                                            ZHANG_INTERFERON_RESPONSE
 3: REACTOME_BUTYRATE_RESPONSE_FACTOR_1_BRF1_BINDS_AND_DESTABILIZES_MRNA
 4:                                                   KIM_LRRC3B_TARGETS
 5:       REACTOME_TRISTETRAPROLIN_TTP_ZFP36_BINDS_AND_DESTABILIZES_MRNA
 6:               WP_HAIR_FOLLICLE_DEVELOPMENT_ORGANOGENESIS_PART_2_OF_3
 7:                             REACTOME_INTERFERON_ALPHA_BETA_SIGNALING
 8:                                 EINAV_INTERFERON_SIGNATURE_IN_CANCER
 9:                                         RADAEVA_RESPONSE_TO_IFNA1_UP
10:                                  BOYAULT_LIVER_CANCER_SUBCLASS_G5_DN
           pval      padj   log2err        ES      NES size
 1: 0.001904233 0.4432369 0.4550599 0.8103573 1.962052   16
 2: 0.005116386 0.4432369 0.4070179 0.7830670 1.956420   19
 3: 0.002291917 0.4432369 0.4317077 0.7516141 1.819821   16
 4: 0.007041256 0.4432369 0.4070179 0.6902862 1.787900   23
 5: 0.002498060 0.4432369 0.4317077 0.7326232 1.773840   16
 6: 0.011207564 0.4432369 0.3807304 0.6614050 1.771704   31
 7: 0.013389296 0.4432369 0.3807304 0.6125033 1.768737   52
 8: 0.003620144 0.4432369 0.4317077 0.6753948 1.744792   24
 9: 0.009164758 0.4432369 0.3807304 0.6131793 1.727130   43
10: 0.008353275 0.4432369 0.3807304 0.6639865 1.719782   23
                                                                                                              leadingEdge
 1:     ENSMUSG00000003849,ENSMUSG00000029304,ENSMUSG00000001349,ENSMUSG00000052957,ENSMUSG00000032085,ENSMUSG00000035783
 2: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000070034,ENSMUSG00000028037,ENSMUSG00000039236,ENSMUSG00000026104,...
 3:                                           ENSMUSG00000021127,ENSMUSG00000024472,ENSMUSG00000028322,ENSMUSG00000032410
 4: ENSMUSG00000035692,ENSMUSG00000028037,ENSMUSG00000060591,ENSMUSG00000060550,ENSMUSG00000060586,ENSMUSG00000060802,...
 5:                        ENSMUSG00000044786,ENSMUSG00000009470,ENSMUSG00000024472,ENSMUSG00000028322,ENSMUSG00000032410
 6: ENSMUSG00000022510,ENSMUSG00000015647,ENSMUSG00000041324,ENSMUSG00000001761,ENSMUSG00000040055,ENSMUSG00000028163,...
 7: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000053113,ENSMUSG00000039236,ENSMUSG00000060591,ENSMUSG00000060550,...
 8: ENSMUSG00000035692,ENSMUSG00000015846,ENSMUSG00000047407,ENSMUSG00000070034,ENSMUSG00000028037,ENSMUSG00000027078,...
 9: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000028037,ENSMUSG00000027078,ENSMUSG00000060591,ENSMUSG00000057329,...
10: ENSMUSG00000037820,ENSMUSG00000027907,ENSMUSG00000028037,ENSMUSG00000006519,ENSMUSG00000026104,ENSMUSG00000052397,...
library(msigdbr)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C8")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)

library(fgsea)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
                    stats = scores_ordered[-1],
                    minSize  = 15,
                    maxSize  = 500)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.34% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fgsea_kegg[order(NES,decreasing=T),][1:10,]
                                                                 pathway
 1:                     DESCARTES_FETAL_STOMACH_PARIETAL_AND_CHIEF_CELLS
 2:                              BUSSLINGER_GASTRIC_REG3A_POSITIVE_CELLS
 3: DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_VASCULAR_SMOOTH_MUSCLE_CELLS
 4:                            DESCARTES_FETAL_STOMACH_MESOTHELIAL_CELLS
 5:                    DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_PERICYTES
 6:                         RUBENSTEIN_SKELETAL_MUSCLE_ENDOTHELIAL_CELLS
 7:                                    AIZARANI_LIVER_C12_NK_NKT_CELLS_4
 8:    DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_FIBROBLASTS_STROMAL_CELLS
 9:                            FAN_EMBRYONIC_CTX_BIG_GROUPS_BRAIN_IMMUNE
10:                                     AIZARANI_LIVER_C17_HEPATOCYTES_3
           pval      padj   log2err        ES      NES size
 1: 0.002648619 0.3484108 0.4317077 0.6931856 1.758385   22
 2: 0.005354202 0.3484108 0.4070179 0.6488476 1.724220   29
 3: 0.044284243 0.3484108 0.2165428 0.5535044 1.618119   89
 4: 0.011193307 0.3484108 0.3807304 0.6286719 1.601858   23
 5: 0.044605809 0.3484108 0.2165428 0.5543262 1.598670   75
 6: 0.052261307 0.3484108 0.1957890 0.5336202 1.579324  126
 7: 0.012336074 0.3484108 0.3807304 0.5906395 1.569540   29
 8: 0.044375645 0.3484108 0.2165428 0.5392578 1.563221   78
 9: 0.052208835 0.3484108 0.1957890 0.5205918 1.544664  130
10: 0.032188841 0.3484108 0.2616635 0.5403901 1.540768   54
                                                                                                              leadingEdge
 1: ENSMUSG00000028553,ENSMUSG00000032278,ENSMUSG00000019326,ENSMUSG00000019232,ENSMUSG00000041372,ENSMUSG00000051497,...
 2: ENSMUSG00000025492,ENSMUSG00000026822,ENSMUSG00000060550,ENSMUSG00000060586,ENSMUSG00000026104,ENSMUSG00000060802,...
 3: ENSMUSG00000001349,ENSMUSG00000012428,ENSMUSG00000044786,ENSMUSG00000029761,ENSMUSG00000053113,ENSMUSG00000036256,...
 4: ENSMUSG00000048482,ENSMUSG00000052957,ENSMUSG00000006649,ENSMUSG00000028871,ENSMUSG00000038119,ENSMUSG00000020363,...
 5: ENSMUSG00000037872,ENSMUSG00000025608,ENSMUSG00000009687,ENSMUSG00000053113,ENSMUSG00000038393,ENSMUSG00000001930,...
 6: ENSMUSG00000041445,ENSMUSG00000025492,ENSMUSG00000025608,ENSMUSG00000035692,ENSMUSG00000038393,ENSMUSG00000034485,...
 7: ENSMUSG00000042385,ENSMUSG00000038393,ENSMUSG00000021998,ENSMUSG00000004612,ENSMUSG00000025491,ENSMUSG00000030847,...
 8: ENSMUSG00000025927,ENSMUSG00000025492,ENSMUSG00000037820,ENSMUSG00000050370,ENSMUSG00000047330,ENSMUSG00000029761,...
 9: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000058715,ENSMUSG00000034165,ENSMUSG00000027907,ENSMUSG00000009687,...
10: ENSMUSG00000003617,ENSMUSG00000032081,ENSMUSG00000037095,ENSMUSG00000030895,ENSMUSG00000026874,ENSMUSG00000026193,...
DE_A28 <- DE_test(expr = expr[meta_genes$expressed,],
                 cond = anno$age == "28",
                 ctrl = "FALSE",
                 covar = anno %>% dplyr::select(sex)) %>%
  tibble::rownames_to_column("gene")

scores <- setNames(sign(log(DE_A28$fc)) * (-log10(DE_A28$pval)),
                   setNames(meta_genes$ensembl_gene_id,
                            meta_genes$ensembl_gene_id_version)[DE_A28$gene])
scores_ordered <- sort(scores, decreasing=T)

genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)

fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
                    stats = scores_ordered[-1],
                    minSize  = 15,
                    maxSize  = 500)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.33% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fgsea_kegg[order(NES,decreasing=T),][1:10,]
                                         pathway       pval      padj   log2err
 1:                 LEIN_OLIGODENDROCYTE_MARKERS 0.04676754 0.5640283 0.2450418
 2: BILANGES_SERUM_AND_RAPAMYCIN_SENSITIVE_GENES 0.03851641 0.5640283 0.2765006
 3:   REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION 0.05170068 0.5640283 0.2311267
 4:                    WP_PROTEASOME_DEGRADATION 0.02025484 0.5640283 0.3524879
 5:                           KIM_LRRC3B_TARGETS 0.01088003 0.5640283 0.3807304
 6:   REACTOME_EUKARYOTIC_TRANSLATION_INITIATION 0.06274510 0.5640283 0.2042948
 7:                                KEGG_RIBOSOME 0.04938272 0.5640283 0.2377938
 8:         REACTOME_SELENOAMINO_ACID_METABOLISM 0.06442953 0.5640283 0.2042948
 9:            WP_CYTOPLASMIC_RIBOSOMAL_PROTEINS 0.05226960 0.5640283 0.2311267
10:          CHNG_MULTIPLE_MYELOMA_HYPERPLOID_UP 0.02237833 0.5640283 0.3524879
           ES      NES size
 1: 0.6375030 2.040223   75
 2: 0.6285894 2.020904   69
 3: 0.6129909 1.983928   88
 4: 0.6194357 1.957097   60
 5: 0.7254219 1.954793   23
 6: 0.5989848 1.950601  115
 7: 0.6001111 1.934551   83
 8: 0.6001178 1.932356  112
 9: 0.6000208 1.927173   86
10: 0.6141116 1.911793   53
                                                                                                              leadingEdge
 1: ENSMUSG00000011884,ENSMUSG00000045087,ENSMUSG00000036098,ENSMUSG00000026519,ENSMUSG00000036503,ENSMUSG00000050121,...
 2: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000061477,ENSMUSG00000002395,...
 3: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
 4: ENSMUSG00000091705,ENSMUSG00000005625,ENSMUSG00000016206,ENSMUSG00000031429,ENSMUSG00000018286,ENSMUSG00000030603,...
 5: ENSMUSG00000079339,ENSMUSG00000028037,ENSMUSG00000024661,ENSMUSG00000035042,ENSMUSG00000037321,ENSMUSG00000069516,...
 6: ENSMUSG00000070319,ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000016554,ENSMUSG00000041453,ENSMUSG00000029614,...
 7: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
 8: ENSMUSG00000026986,ENSMUSG00000040354,ENSMUSG00000031584,ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000063179,...
 9: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
10: ENSMUSG00000070319,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000032376,ENSMUSG00000062997,ENSMUSG00000024608,...
DE_A3 <- DE_test(expr = expr[meta_genes$expressed,],
                 cond = anno$age == "3",
                 ctrl = "FALSE",
                 covar = anno %>% dplyr::select(sex)) %>%
  tibble::rownames_to_column("gene")

scores <- setNames(sign(log(DE_A3$fc)) * (-log10(DE_A3$pval)),
                   setNames(meta_genes$ensembl_gene_id,
                            meta_genes$ensembl_gene_id_version)[DE_A3$gene])
scores_ordered <- sort(scores, decreasing=T)

genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)

fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
                    stats = scores_ordered,
                    minSize  = 15,
                    maxSize  = 500)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fgsea_kegg[order(NES,decreasing=T),][1:10,]
                                                                                                                      pathway
 1: REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS
 2:                                                                                   REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
 3:                                                                 WP_ELECTRON_TRANSPORT_CHAIN_OXPHOS_SYSTEM_IN_MITOCHONDRIA
 4:                                                     REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT
 5:                                                                                             REACTOME_COMPLEX_I_BIOGENESIS
 6:                                                                   WP_MITOCHONDRIAL_COMPLEX_I_ASSEMBLY_MODEL_OXPHOS_SYSTEM
 7:                                                                                            KEGG_OXIDATIVE_PHOSPHORYLATION
 8:                                                                                              WP_OXIDATIVE_PHOSPHORYLATION
 9:                                                                                                   KEGG_PARKINSONS_DISEASE
10:                                                                                              MALONEY_RESPONSE_TO_17AAG_DN
            pval         padj   log2err        ES      NES size
 1: 3.862413e-16 1.736541e-12 1.0376962 0.6616233 2.526326  125
 2: 1.180405e-14 2.653551e-11 0.9865463 0.6789028 2.522467  102
 3: 1.871573e-12 2.103648e-09 0.8986712 0.6550337 2.426148  101
 4: 1.772006e-14 2.655646e-11 0.9759947 0.6038376 2.404693  171
 5: 1.051930e-08 5.254977e-06 0.7477397 0.6813267 2.299007   56
 6: 2.132274e-08 9.586704e-06 0.7337620 0.6807924 2.285191   55
 7: 9.838494e-11 8.846773e-08 0.8390889 0.5921459 2.255667  122
 8: 5.273679e-07 1.481904e-04 0.6594444 0.6427689 2.194115   59
 9: 3.723448e-09 2.092578e-06 0.7749390 0.5659178 2.151906  121
10: 1.412064e-07 4.232426e-05 0.6901325 0.6025443 2.145079   77
                                                                                                              leadingEdge
 1: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
 2: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
 3: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
 4: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
 5: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000035142,ENSMUSG00000064341,ENSMUSG00000064368,...
 6: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000035142,ENSMUSG00000064341,ENSMUSG00000064368,...
 7: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
 8: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000064341,ENSMUSG00000064357,ENSMUSG00000064368,...
 9: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000025889,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,...
10: ENSMUSG00000025889,ENSMUSG00000038489,ENSMUSG00000009013,ENSMUSG00000038607,ENSMUSG00000022205,ENSMUSG00000057278,...
DE_A12 <- DE_test(expr = expr[meta_genes$expressed,],
                 cond = anno$age == "12",
                 ctrl = "FALSE",
                 covar = anno %>% dplyr::select(sex)) %>%
  tibble::rownames_to_column("gene")

scores <- setNames(sign(log(DE_A12$fc)) * (-log10(DE_A12$pval)),
                   setNames(meta_genes$ensembl_gene_id,
                            meta_genes$ensembl_gene_id_version)[DE_A12$gene])
scores_ordered <- sort(scores, decreasing=T)

genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)

fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
                    stats = scores_ordered,
                    minSize  = 15,
                    maxSize  = 500)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.44% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : There were 33 pathways for which P-values were not calculated
properly due to unbalanced (positive and negative) gene-level statistic values.
For such pathways pval, padj, NES, log2err are set to NA. You can try to
increase the value of the argument nPermSimple (for example set it nPermSimple
= 10000)
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
fgsea_kegg[order(NES,decreasing=T),][1:10,]
                                                                                             pathway
 1:                                                                            PID_EPHA2_FWD_PATHWAY
 2:                                                     REACTOME_INTERACTION_BETWEEN_L1_AND_ANKYRINS
 3:                                                                                  PID_HIF2PATHWAY
 4:                                                        STARK_PREFRONTAL_CORTEX_22Q11_DELETION_UP
 5:                                                                             PID_RHOA_REG_PATHWAY
 6:                                                             DACOSTA_UV_RESPONSE_VIA_ERCC3_TTD_DN
 7:                                                         REACTOME_NRAGE_SIGNALS_DEATH_THROUGH_JNK
 8:                                                                             PID_UPA_UPAR_PATHWAY
 9: WP_MFAP5_EFFECT_ON_PERMEABILITY_AND_MOTILITY_OF_ENDOTHELIAL_CELLS_VIA_CYTOSKELETON_REARRANGEMENT
10:                                                                   GENTILE_UV_RESPONSE_CLUSTER_D4
            pval         padj   log2err        ES      NES size
 1: 6.534150e-05 4.147977e-03 0.5384341 0.7627557 1.860449   19
 2: 7.441880e-05 4.485923e-03 0.5384341 0.6984472 1.811016   28
 3: 1.127958e-04 5.861183e-03 0.5384341 0.6815470 1.787110   31
 4: 1.946899e-12 4.344504e-09 0.8986712 0.5771378 1.780627  194
 5: 3.555479e-05 2.783878e-03 0.5573322 0.6339556 1.774314   46
 6: 2.536675e-06 3.573722e-04 0.6272567 0.5902205 1.748159   82
 7: 1.980133e-05 1.747264e-03 0.5756103 0.6089109 1.743069   57
 8: 2.131539e-04 9.513058e-03 0.5188481 0.6646845 1.731582   29
 9: 7.830164e-04 2.314306e-02 0.4772708 0.7153930 1.727642   18
10: 7.336258e-05 4.485923e-03 0.5384341 0.6042992 1.718746   54
                                                                                                              leadingEdge
 1: ENSMUSG00000034342,ENSMUSG00000033721,ENSMUSG00000027954,ENSMUSG00000032737,ENSMUSG00000002489,ENSMUSG00000027646,...
 2: ENSMUSG00000056258,ENSMUSG00000031543,ENSMUSG00000069601,ENSMUSG00000020315,ENSMUSG00000057738,ENSMUSG00000064329,...
 3: ENSMUSG00000036450,ENSMUSG00000027954,ENSMUSG00000030103,ENSMUSG00000024140,ENSMUSG00000009406,ENSMUSG00000062960,...
 4: ENSMUSG00000029673,ENSMUSG00000056608,ENSMUSG00000050310,ENSMUSG00000042364,ENSMUSG00000005089,ENSMUSG00000047013,...
 5: ENSMUSG00000035133,ENSMUSG00000033721,ENSMUSG00000066406,ENSMUSG00000004677,ENSMUSG00000031442,ENSMUSG00000058230,...
 6: ENSMUSG00000031586,ENSMUSG00000037646,ENSMUSG00000035614,ENSMUSG00000038126,ENSMUSG00000033278,ENSMUSG00000001151,...
 7: ENSMUSG00000033721,ENSMUSG00000002489,ENSMUSG00000021708,ENSMUSG00000066406,ENSMUSG00000031442,ENSMUSG00000028059,...
 8: ENSMUSG00000027087,ENSMUSG00000040249,ENSMUSG00000058325,ENSMUSG00000027646,ENSMUSG00000031955,ENSMUSG00000026193,...
 9: ENSMUSG00000030516,ENSMUSG00000021823,ENSMUSG00000029528,ENSMUSG00000027087,ENSMUSG00000022836,ENSMUSG00000013936,...
10: ENSMUSG00000030103,ENSMUSG00000032702,ENSMUSG00000055065,ENSMUSG00000040761,ENSMUSG00000028479,ENSMUSG00000021838,...
plotEnrichment(genesets_celltype_list[["PID_EPHA2_FWD_PATHWAY"]],scores_ordered) + labs(title="PID_EPHA2_FWD_PATHWAY")

fgsea_kegg[order(NES,decreasing=F),][1:10,1:7]
                                                      pathway         pval
 1: WP_ELECTRON_TRANSPORT_CHAIN_OXPHOS_SYSTEM_IN_MITOCHONDRIA 4.393021e-10
 2:                             REACTOME_COMPLEX_I_BIOGENESIS 7.298615e-07
 3:                              WP_OXIDATIVE_PHOSPHORYLATION 7.669988e-07
 4:                   REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 1.933683e-07
 5:                           REACTOME_ANTIMICROBIAL_PEPTIDES 6.578347e-04
 6:   WP_MITOCHONDRIAL_COMPLEX_I_ASSEMBLY_MODEL_OXPHOS_SYSTEM 6.598843e-05
 7:                              LEIN_OLIGODENDROCYTE_MARKERS 6.962380e-06
 8:                                          KORKOLA_TERATOMA 2.086173e-04
 9:                      KEGG_DRUG_METABOLISM_CYTOCHROME_P450 5.650876e-04
10:                                            MOOTHA_VOXPHOS 4.863726e-06
            padj   log2err         ES       NES size
 1: 3.797451e-07 0.8140358 -0.4826310 -2.503480  101
 2: 1.551129e-04 0.6594444 -0.5342649 -2.359534   56
 3: 1.555962e-04 0.6594444 -0.5096109 -2.263485   59
 4: 6.164306e-05 0.6901325 -0.4280511 -2.240438  102
 5: 2.038831e-02 0.4772708 -0.6164926 -2.122986   21
 6: 4.147977e-03 0.5384341 -0.4763504 -2.104925   55
 7: 7.226303e-04 0.6105269 -0.4253281 -2.081980   75
 8: 9.404635e-03 0.5188481 -0.5195085 -2.011514   36
 9: 1.801419e-02 0.4772708 -0.5205306 -1.950327   31
10: 5.866705e-04 0.6105269 -0.4095270 -1.941403   87